In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; see the fallback sketched below
from sklearn import linear_model
In [2]:
%matplotlib inline
# If you are using IPython or Jupyter, this renders plots inline in the notebook
In [3]:
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
#1. CRIM: per capita crime rate by town
#2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
#3. INDUS: proportion of non-retail business acres per town
#4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#5. NOX: nitric oxides concentration (parts per 10 million)
#6. RM: average number of rooms per dwelling
#7. AGE: proportion of owner-occupied units built prior to 1940
#8. DIS: weighted distances to five Boston employment centres
#9. RAD: index of accessibility to radial highways
#10. TAX: full-value property-tax rate per $10,000
#11. PTRATIO: pupil-teacher ratio by town
#12. B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#13. LSTAT: % lower status of the population
#14. MEDV: Median value of owner-occupied homes in $1000's
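In [ ]:
# NOTE: load_boston was removed in scikit-learn 1.2. If the import above fails,
# a possible fallback (a sketch, assuming the original CMU StatLib source is
# still reachable) is to rebuild the same DataFrame by hand:
#data_url = "http://lib.stat.cmu.edu/datasets/boston"
#raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
#data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
#target = raw_df.values[1::2, 2]
#feature_names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS',
#                 'RAD','TAX','PTRATIO','B','LSTAT']
#dataset = pd.DataFrame(data, columns=feature_names)
#dataset['target'] = target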
In [4]:
observations = len(dataset)
variables = dataset.columns[:-1]
X = dataset.iloc[:,:-1]
y = dataset['target'].values
X.head()
Out[4]:
In [5]:
X.describe()
Out[5]:
In [6]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [7]:
Xc = sm.add_constant(X)
linear_regression = sm.OLS(y,Xc)
fitted_model = linear_regression.fit()
Xc.head()
Out[7]:
In [8]:
# As an alternative, this example uses the statsmodels.formula.api module.
# Equivalent to the R syntax for linear models, it requires you to specify
# the form of the regression as 'response ~ predictor1 (+ predictor2 + ...)'
#linear_regression = smf.ols(formula = 'target ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS +\
#                            RAD + TAX + PTRATIO + B + LSTAT', data=dataset)
#fitted_model = linear_regression.fit()
In [9]:
fitted_model.summary()
Out[9]:
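In [ ]:
# Besides summary(), the fitted results object exposes the same quantities
# programmatically through the standard statsmodels results API:
print (fitted_model.params)     # estimated coefficients (including the constant)
print (fitted_model.pvalues)    # p-values of the t-tests on the coefficients
print (fitted_model.rsquared)   # R-squared of the fit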
In [10]:
X = dataset.iloc[:,:-1]
correlation_matrix = X.corr()
print (correlation_matrix)
In [11]:
def visualize_correlation_matrix(data, hurdle=0.0):
    # compute the correlation matrix and zero out correlations below the hurdle
    R = np.corrcoef(data, rowvar=0)
    R[np.where(np.abs(R) < hurdle)] = 0.0
    heatmap = plt.pcolor(R, cmap=mpl.cm.coolwarm, alpha=0.8)
    heatmap.axes.set_frame_on(False)
    heatmap.axes.set_yticks(np.arange(R.shape[0]) + 0.5, minor=False)
    heatmap.axes.set_xticks(np.arange(R.shape[1]) + 0.5, minor=False)
    heatmap.axes.set_xticklabels(variables, minor=False)
    plt.xticks(rotation=90)
    heatmap.axes.set_yticklabels(variables, minor=False)
    plt.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False)
    plt.colorbar()
    plt.show()

visualize_correlation_matrix(X, hurdle=0.5)
In [12]:
corr = np.corrcoef(X, rowvar=0)
eigenvalues, eigenvectors = np.linalg.eig(corr)
In [13]:
print (eigenvalues)
In [14]:
print (eigenvectors[:,8])
In [15]:
print (variables[2], variables[8], variables[9])
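In [ ]:
# A complementary multicollinearity check (a minimal sketch using statsmodels'
# variance_inflation_factor on the constant-augmented matrix Xc from above;
# VIF values beyond roughly 5-10 are a common rule-of-thumb warning sign):
from statsmodels.stats.outliers_influence import variance_inflation_factor
for i, varname in enumerate(variables):
    print ('%8s VIF: %8.2f' % (varname, variance_inflation_factor(Xc.values, i+1)))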
In [16]:
from sklearn.preprocessing import StandardScaler
observations = len(dataset)
variables = dataset.columns[:-1]  # keep only the predictor names
standardization = StandardScaler()
Xst = standardization.fit_transform(X)
original_means = standardization.mean_
original_stds = standardization.scale_
Xst = np.column_stack((Xst,np.ones(observations)))
y = dataset['target'].values
In [17]:
import numpy as np

def random_w(p):
    # random starting values for the p coefficients
    return np.array([np.random.normal() for j in range(p)])

def hypothesis(X, w):
    return np.dot(X, w)

def loss(X, w, y):
    return hypothesis(X, w) - y

def squared_loss(X, w, y):
    return loss(X, w, y)**2

def gradient(X, w, y):
    # partial derivative of the mean squared loss for each coefficient
    gradients = list()
    n = float(len(y))
    for j in range(len(w)):
        gradients.append(np.sum(loss(X, w, y) * X[:, j]) / n)
    return gradients

def update(X, w, y, alpha=0.01):
    return [t - alpha * g for t, g in zip(w, gradient(X, w, y))]

def optimize(X, y, alpha=0.01, eta=10**-12, iterations=1000):
    w = random_w(X.shape[1])
    path = list()
    for k in range(iterations):
        SSL = np.sum(squared_loss(X, w, y))
        new_w = update(X, w, y, alpha=alpha)
        new_SSL = np.sum(squared_loss(X, new_w, y))
        w = new_w
        # stop early once the change in the sum of squared losses drops below eta
        if k >= 5 and abs(new_SSL - SSL) <= eta:
            path.append(new_SSL)
            return w, path
        if k % (iterations // 20) == 0:
            path.append(new_SSL)
    return w, path

alpha = 0.01
w, path = optimize(Xst, y, alpha, eta=10**-12, iterations=20000)
print ("These are our final standardized coefficients: " + ', '.join(map(lambda x: "%0.4f" % x, w)))
In [18]:
unstandardized_betas = w[:-1] / original_stds
unstandardized_bias = w[-1] - np.sum((original_means / original_stds) * w[:-1])
print ('%8s: %8.4f' % ('bias', unstandardized_bias))
for beta, varname in zip(unstandardized_betas, variables):
    print ('%8s: %8.4f' % (varname, beta))
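In [ ]:
# Cross-check (a sketch): scikit-learn's LinearRegression on the raw, unscaled
# predictors should reproduce the unstandardized coefficients recovered above.
check = linear_model.LinearRegression(fit_intercept=True).fit(X, y)
print ('%8s: %8.4f' % ('bias', check.intercept_))
for beta, varname in zip(check.coef_, variables):
    print ('%8s: %8.4f' % (varname, beta))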
In [19]:
linear_regression = linear_model.LinearRegression(fit_intercept=True)
# (the 'normalize' parameter was removed from recent scikit-learn versions;
# scaling is handled explicitly with StandardScaler below)
# ATTENTION: min-max normalization = (x - x_min) / (x_max - x_min), whereas
# z-score standardization = (x - x_mean) / x_std
In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
standardization = StandardScaler()
Stand_coef_linear_reg = make_pipeline(standardization, linear_regression)
In [21]:
linear_regression.fit(X,y)
for coef, var in sorted(zip(map(abs, linear_regression.coef_), dataset.columns[:-1]), reverse=True):
    print ("%6.3f %s" % (coef, var))
In [22]:
Stand_coef_linear_reg.fit(X,y)
for coef, var in sorted(zip(map(abs, Stand_coef_linear_reg.steps[1][1].coef_), dataset.columns[:-1]), reverse=True):
    print ("%6.3f %s" % (coef, var))
In [23]:
from sklearn.metrics import r2_score
linear_regression = linear_model.LinearRegression(fit_intercept=True)

def r2_est(X, y):
    return r2_score(y, linear_regression.fit(X, y).predict(X))

print ('Baseline R2: %0.3f' % r2_est(X, y))
In [24]:
r2_impact = list()
for j in range(X.shape[1]):
    selection = [i for i in range(X.shape[1]) if i != j]
    r2_impact.append((r2_est(X, y) - r2_est(X.values[:, selection], y), dataset.columns[j]))

for imp, varname in sorted(r2_impact, reverse=True):
    print ('%6.3f %s' % (imp, varname))
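In [ ]:
# Plain R2 never decreases when predictors are added; the adjusted R2 penalizes
# model complexity and is a fairer yardstick (a sketch using the usual formula
# with n observations and p predictors):
n, p = X.shape
adjusted_r2 = 1.0 - (1.0 - r2_est(X, y)) * (n - 1.0) / (n - p - 1.0)
print ('Adjusted R2: %0.3f' % adjusted_r2)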
In [25]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
linear_regression = linear_model.LinearRegression(fit_intercept=True)
create_interactions = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
In [26]:
def r2_est(X, y):
    return r2_score(y, linear_regression.fit(X, y).predict(X))

baseline = r2_est(X, y)
print ('Baseline R2: %0.3f' % baseline)
In [27]:
Xi = create_interactions.fit_transform(X)
main_effects = create_interactions.n_features_in_  # number of original predictors
In [28]:
for k, effect in enumerate(create_interactions.powers_[main_effects:]):
    termA, termB = variables[effect == 1]
    increment = r2_est(Xi[:, list(range(0, main_effects)) + [main_effects + k]], y) - baseline
    if increment > 0.01:
        print ('Adding interaction %8s *%8s R2: %5.3f' % (termA, termB, increment))
In [29]:
Xi = X.copy()  # work on a copy so the new column does not modify X in place
Xi['interaction'] = X['RM'] * X['LSTAT']
print ('R2 of a model with RM*LSTAT interaction: %0.3f' % r2_est(Xi,y))
In [30]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
linear_regression = linear_model.LinearRegression(fit_intercept=True)
create_cubic = PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
create_highdegree = PolynomialFeatures(degree=7, interaction_only=False, include_bias=False)
linear_predictor = make_pipeline(linear_regression)
#quadratic_predictor = make_pipeline(create_quadratic, linear_regression)
cubic_predictor = make_pipeline(create_cubic, linear_regression)
In [31]:
print(variables)
predictor = 'LSTAT'
x = dataset['LSTAT'].values.reshape((observations,1))
xt = np.arange(0,50,0.1).reshape((-1,1))  # -1 infers the integer length; 50/0.1 is a float and would fail
x_range = [dataset[predictor].min(),dataset[predictor].max()]
y_range = [dataset['target'].min(),dataset['target'].max()]
scatter = dataset.plot(kind='scatter', x=predictor, y='target', xlim=x_range, ylim=y_range)
regr_line = scatter.plot(xt, linear_predictor.fit(x,y).predict(xt), '-', color='red', linewidth=2)
In [32]:
scatter = dataset.plot(kind='scatter', x=predictor, y='target', xlim=x_range, ylim=y_range)
regr_line = scatter.plot(xt, cubic_predictor.fit(x,y).predict(xt), '-', color='red', linewidth=2)
In [33]:
for d in [1,2,3,5,15]:
    create_poly = PolynomialFeatures(degree=d, interaction_only=False, include_bias=False)
    poly = make_pipeline(create_poly, StandardScaler(), linear_regression)
    model = poly.fit(x,y)
    print ("R2 degree - %2i polynomial :%0.3f" % (d, r2_score(y, model.predict(x))))
In [34]:
scatter = dataset.plot(kind='scatter', x=predictor, y='target', xlim=x_range, ylim=y_range)
regr_line = scatter.plot(xt, model.predict(xt), '-', color='red', linewidth=2)
In [ ]: